In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from joblib import dump, load 

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
pd.set_option('display.float_format', lambda x: '%.3f' % x)
In [2]:
data = pd.read_csv('historical_data.csv')
In [3]:
data.shape
Out[3]:
(197428, 16)
In [4]:
data.describe()
Out[4]:
market_id store_id order_protocol total_items subtotal num_distinct_items min_item_price max_item_price total_onshift_dashers total_busy_dashers total_outstanding_orders estimated_order_place_duration estimated_store_to_consumer_driving_duration
count 196441.000 197428.000 196433.000 197428.000 197428.000 197428.000 197428.000 197428.000 181166.000 181166.000 181166.000 197428.000 196902.000
mean 2.979 3530.510 2.882 3.196 2682.331 2.671 686.218 1159.589 44.808 41.740 58.050 308.560 545.359
std 1.525 2053.497 1.504 2.667 1823.094 1.630 522.039 558.411 34.527 32.146 52.662 90.140 219.353
min 1.000 1.000 1.000 1.000 0.000 1.000 -86.000 0.000 -4.000 -5.000 -6.000 0.000 0.000
25% 2.000 1686.000 1.000 2.000 1400.000 1.000 299.000 800.000 17.000 15.000 17.000 251.000 382.000
50% 3.000 3592.000 3.000 3.000 2200.000 2.000 595.000 1095.000 37.000 34.000 41.000 251.000 544.000
75% 4.000 5299.000 4.000 4.000 3395.000 3.000 949.000 1395.000 65.000 62.000 85.000 446.000 702.000
max 6.000 6987.000 7.000 411.000 27100.000 20.000 14700.000 14700.000 171.000 154.000 285.000 2715.000 2088.000
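
A couple of anomalies stand out in the summary: min_item_price has a negative minimum (-86), and the dasher/order counts dip as low as -4 to -6, which should be impossible. A quick tally (a sketch, not an original cell) shows how widespread these are:

# Count rows with implausible negative values (sketch)
for col in ['min_item_price', 'total_onshift_dashers',
            'total_busy_dashers', 'total_outstanding_orders']:
    print(col, (data[col] < 0).sum(), 'negative rows')
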

Feature Engineering - I

In [5]:
# Parse the timestamps and derive delivery_time_secs, which will be the model target
# Convert UTC to US/Pacific so the timestamps are easier to read in local time
data['created_at'] = pd.to_datetime(data['created_at']).dt.tz_localize('utc').dt.tz_convert('US/Pacific')
data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time']).dt.tz_localize('utc').dt.tz_convert('US/Pacific')
data['delivery_time_secs'] = (data['actual_delivery_time'] - data['created_at']).dt.total_seconds()
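
Since the target is a raw timestamp difference, a quick sanity check (sketch) verifies that it is strictly positive wherever it exists:

# Sanity check (sketch): the derived target should be strictly positive
assert (data['delivery_time_secs'].dropna() > 0).all()
print(data['delivery_time_secs'].isna().sum(), 'rows are missing the target')
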
In [6]:
# Create additional date-time features based on created_at
data['hour'] = data['created_at'].dt.hour
data['day'] = data['created_at'].dt.dayofweek
# Meal-time buckets: early morning, breakfast, lunch, snack, dinner
# Each bucket is [start, end): e.g. hour_0_6 covers 00:00 to 05:59
data['hour_0_6'] = data['hour'].map(lambda x: 1 if 0 <= x < 6 else 0)
data['hour_6_11'] = data['hour'].map(lambda x: 1 if 6 <= x < 11 else 0)
data['hour_11_15'] = data['hour'].map(lambda x: 1 if 11 <= x < 15 else 0)
data['hour_15_18'] = data['hour'].map(lambda x: 1 if 15 <= x < 18 else 0)
data['hour_18_23'] = data['hour'].map(lambda x: 1 if 18 <= x < 24 else 0)  # include hour 23 so every hour lands in a bucket
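
The five indicators can also be produced from a single pd.cut, which keeps the bucket boundaries in one place; a sketch:

# Equivalent bucketing with pd.cut (sketch): one categorical, same [start, end) edges
meal_bucket = pd.cut(data['hour'], bins=[0, 6, 11, 15, 18, 24], right=False,
                     labels=['0_6', '6_11', '11_15', '15_18', '18_23'])
# pd.get_dummies(meal_bucket, prefix='hour') would reproduce the indicator columns
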
In [7]:
data['avg_price_item'] = data['subtotal'] / data['total_items']
data['avg_price_distinct_item'] = data['subtotal'] / data['num_distinct_items']
In [8]:
data.head()
Out[8]:
market_id created_at actual_delivery_time store_id store_primary_category order_protocol total_items subtotal num_distinct_items min_item_price ... delivery_time_secs hour day hour_0_6 hour_6_11 hour_11_15 hour_15_18 hour_18_23 avg_price_item avg_price_distinct_item
0 1.000 2015-02-06 14:24:17-08:00 2015-02-06 15:27:16-08:00 1845 american 1.000 4 3441 4 557 ... 3779.000 14 4 0 0 1 0 0 860.250 860.250
1 2.000 2015-02-10 13:49:25-08:00 2015-02-10 14:56:29-08:00 5477 mexican 2.000 1 1900 1 1400 ... 4024.000 13 1 0 0 1 0 0 1900.000 1900.000
2 3.000 2015-01-22 12:39:28-08:00 2015-01-22 13:09:09-08:00 5477 NaN 1.000 1 1900 1 1900 ... 1781.000 12 3 0 0 1 0 0 1900.000 1900.000
3 3.000 2015-02-03 13:21:45-08:00 2015-02-03 14:13:00-08:00 5477 NaN 1.000 6 6900 5 600 ... 3075.000 13 1 0 0 1 0 0 1150.000 1380.000
4 3.000 2015-02-14 18:40:36-08:00 2015-02-14 19:20:26-08:00 5477 NaN 1.000 3 3900 3 1100 ... 2390.000 18 5 0 0 0 0 1 1300.000 1300.000

5 rows × 26 columns

In [9]:
data.dtypes
Out[9]:
market_id                                                          float64
created_at                                      datetime64[ns, US/Pacific]
actual_delivery_time                            datetime64[ns, US/Pacific]
store_id                                                             int64
store_primary_category                                              object
order_protocol                                                     float64
total_items                                                          int64
subtotal                                                             int64
num_distinct_items                                                   int64
min_item_price                                                       int64
max_item_price                                                       int64
total_onshift_dashers                                              float64
total_busy_dashers                                                 float64
total_outstanding_orders                                           float64
estimated_order_place_duration                                       int64
estimated_store_to_consumer_driving_duration                       float64
delivery_time_secs                                                 float64
hour                                                                 int64
day                                                                  int64
hour_0_6                                                             int64
hour_6_11                                                            int64
hour_11_15                                                           int64
hour_15_18                                                           int64
hour_18_23                                                           int64
avg_price_item                                                     float64
avg_price_distinct_item                                            float64
dtype: object

Exploratory Data Analysis

In [10]:
data['delivery_time_secs'].describe()
Out[10]:
count    197421.000
mean       2908.257
std       19229.609
min         101.000
25%        2104.000
50%        2660.000
75%        3381.000
max     8516859.000
Name: delivery_time_secs, dtype: float64
In [11]:
data['delivery_time_secs'].quantile([ i*1.0/100 for i in range(0,100,5)])
Out[11]:
0.000    101.000
0.050   1492.000
0.100   1699.000
0.150   1855.000
0.200   1986.000
0.250   2104.000
0.300   2217.000
0.350   2328.000
0.400   2436.000
0.450   2546.000
0.500   2660.000
0.550   2779.000
0.600   2906.000
0.650   3047.000
0.700   3200.000
0.750   3381.000
0.800   3594.000
0.850   3862.000
0.900   4235.000
0.950   4872.000
Name: delivery_time_secs, dtype: float64
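
The tail beyond the 95th percentile is extremely long (the max is 8.5M seconds, i.e. months), which is the motivation for log-transforming the target at training time later on. A quick look at the transformed distribution (sketch):

# The log of the target is far closer to symmetric (sketch)
np.log(data['delivery_time_secs'].dropna()).hist(bins=100)
plt.xlabel('log(delivery_time_secs)')
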
In [12]:
data[data['delivery_time_secs'] > 3600*12].sort_values(by=['delivery_time_secs'],ascending=False)
Out[12]:
market_id created_at actual_delivery_time store_id store_primary_category order_protocol total_items subtotal num_distinct_items min_item_price ... delivery_time_secs hour day hour_0_6 hour_6_11 hour_11_15 hour_15_18 hour_18_23 avg_price_item avg_price_distinct_item
2690 1.000 2014-10-18 22:24:15-07:00 2015-01-25 11:11:54-08:00 3560 italian 1.000 1 1695 1 1595 ... 8516859.000 22 5 0 0 0 0 1 1695.000 1695.000
185550 4.000 2015-01-28 00:34:06-08:00 2015-02-01 08:25:25-08:00 6503 dessert 5.000 3 1520 3 220 ... 373879.000 0 2 1 0 0 0 0 506.667 506.667
27189 1.000 2015-02-15 18:24:09-08:00 2015-02-19 14:45:31-08:00 4338 indian 3.000 4 4980 4 995 ... 332482.000 18 6 0 0 0 0 1 1245.000 1245.000
83055 2.000 2015-01-31 18:18:07-08:00 2015-02-01 10:08:39-08:00 355 burger 4.000 3 2379 2 389 ... 57032.000 18 5 0 0 0 0 1 793.000 1189.500
190860 1.000 2015-02-15 18:31:05-08:00 2015-02-16 09:38:32-08:00 2169 indian 3.000 4 3660 4 375 ... 54447.000 18 6 0 0 0 0 1 915.000 915.000
86952 3.000 2015-02-04 18:11:40-08:00 2015-02-05 07:34:38-08:00 1764 thai 2.000 3 2185 3 495 ... 48178.000 18 2 0 0 0 0 1 728.333 728.333
76743 2.000 2015-02-14 20:17:35-08:00 2015-02-15 08:59:00-08:00 3247 pizza 6.000 1 990 1 795 ... 45685.000 20 5 0 0 0 0 1 990.000 990.000

7 rows × 26 columns

In [13]:
data.loc[data['delivery_time_secs'] <= 4872, ['delivery_time_secs','hour']].groupby('hour').mean().plot(kind='bar')
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a210f3390>
In [14]:
# Outliers in terms of delivery time secs
print(data.shape)
for i in range(2,10+1):
    print('More than', i, 'hours',data.loc[data['delivery_time_secs'] > 3600*i].shape)
(197428, 26)
More than 2 hours (1090, 26)
More than 3 hours (138, 26)
More than 4 hours (58, 26)
More than 5 hours (35, 26)
More than 6 hours (25, 26)
More than 7 hours (18, 26)
More than 8 hours (14, 26)
More than 9 hours (13, 26)
More than 10 hours (11, 26)
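
These extreme rows are almost certainly data errors (a three-month "delivery"). One option, sketched here but not applied, is to cap or drop targets beyond a high quantile before training:

# Drop targets beyond the 99.9th percentile (sketch; the threshold is a judgment call)
cap = data['delivery_time_secs'].quantile(0.999)
print('cap:', cap, '| rows beyond cap:', (data['delivery_time_secs'] > cap).sum())
# data = data[data['delivery_time_secs'] <= cap]
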
In [15]:
# Plot a heatmap of correlation coefficients to see how each predictor relates to the target and to the other predictors
corr = data.corr()
fig, ax = plt.subplots(figsize=(20,20))         # Sample figsize in inches
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns,
        annot=True,
        ax=ax)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e5f7278>
In [16]:
cols = ['market_id',
        'store_id',
        'day',
        'hour',
        'estimated_store_to_consumer_driving_duration',
        'total_onshift_dashers',
        'total_busy_dashers',
        'total_outstanding_orders',
        'subtotal',
        'delivery_time_secs']
sns.pairplot(data[cols])
/Users/ssatpati/anaconda3/lib/python3.6/site-packages/numpy/lib/histograms.py:754: RuntimeWarning: invalid value encountered in greater_equal
  keep = (tmp_a >= first_edge)
/Users/ssatpati/anaconda3/lib/python3.6/site-packages/numpy/lib/histograms.py:755: RuntimeWarning: invalid value encountered in less_equal
  keep &= (tmp_a <= last_edge)
Out[16]:
<seaborn.axisgrid.PairGrid at 0x1a1d85a8d0>
In [74]:
fig = plt.figure(figsize=(10, 5), dpi=80)
ax = fig.add_subplot(2, 3, 1)
sns.regplot(x="hour", y="delivery_time_secs", data=data, ax=ax)
ax = fig.add_subplot(2, 3, 2)
sns.regplot(x="day", y="delivery_time_secs", data=data, ax=ax)
ax = fig.add_subplot(2, 3, 3)
sns.regplot(x="total_onshift_dashers", y="delivery_time_secs", data=data, ax=ax)

ax = fig.add_subplot(2, 3, 4)
sns.regplot(x="market_id", y="delivery_time_secs", data=data, ax=ax)
ax = fig.add_subplot(2, 3, 5)
sns.regplot(x="store_id", y="delivery_time_secs", data=data, ax=ax)
ax = fig.add_subplot(2, 3, 6)
sns.regplot(x="total_outstanding_orders", y="delivery_time_secs", data=data, ax=ax)
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a4798f940>

Handle Missing Data

In [17]:
data.isna().sum()
Out[17]:
market_id                                         987
created_at                                          0
actual_delivery_time                                7
store_id                                            0
store_primary_category                           4760
order_protocol                                    995
total_items                                         0
subtotal                                            0
num_distinct_items                                  0
min_item_price                                      0
max_item_price                                      0
total_onshift_dashers                           16262
total_busy_dashers                              16262
total_outstanding_orders                        16262
estimated_order_place_duration                      0
estimated_store_to_consumer_driving_duration      526
delivery_time_secs                                  7
hour                                                0
day                                                 0
hour_0_6                                            0
hour_6_11                                           0
hour_11_15                                          0
hour_15_18                                          0
hour_18_23                                          0
avg_price_item                                      0
avg_price_distinct_item                             0
dtype: int64

actual_delivery_time

In [18]:
# Drop the 7 records with no actual_delivery_time (and hence no target)
print(data.shape)
data.drop(data[data['delivery_time_secs'].isna()].index, inplace=True)
print(data.shape)
(197428, 26)
(197421, 26)

market_id

In [19]:
data[data['market_id'].isna()].head()
Out[19]:
market_id created_at actual_delivery_time store_id store_primary_category order_protocol total_items subtotal num_distinct_items min_item_price ... delivery_time_secs hour day hour_0_6 hour_6_11 hour_11_15 hour_15_18 hour_18_23 avg_price_item avg_price_distinct_item
45 nan 2015-02-08 19:27:37-08:00 2015-02-08 20:22:18-08:00 5054 italian 1.000 2 2400 2 600 ... 3281.000 19 6 0 0 0 0 1 1200.000 1200.000
182 nan 2015-01-31 21:32:34-08:00 2015-01-31 22:01:21-08:00 5081 mediterranean 3.000 2 1845 2 600 ... 1727.000 21 5 0 0 0 0 1 922.500 922.500
970 nan 2015-02-16 18:17:43-08:00 2015-02-16 19:15:14-08:00 1148 NaN nan 3 3400 3 1000 ... 3451.000 18 0 0 0 0 0 1 1133.333 1133.333
1126 nan 2015-02-17 19:50:52-08:00 2015-02-17 20:15:09-08:00 1904 sandwich 2.000 1 1395 1 795 ... 1457.000 19 1 0 0 0 0 1 1395.000 1395.000
1625 nan 2015-02-16 19:49:46-08:00 2015-02-16 20:21:27-08:00 283 pizza nan 1 1699 1 1399 ... 1901.000 19 0 0 0 0 0 1 1699.000 1699.000

5 rows × 26 columns

In [20]:
data['market_id'].value_counts()
Out[20]:
2.000    55055
4.000    47597
1.000    38037
3.000    23296
5.000    17999
6.000    14450
Name: market_id, dtype: int64
In [21]:
# Map missing market_id values to an 'unknown' market (id=99)
data['market_id'] = np.where(data['market_id'].isna(), 99, data['market_id'])
In [22]:
data['market_id'].value_counts()
Out[22]:
2.000     55055
4.000     47597
1.000     38037
3.000     23296
5.000     17999
6.000     14450
99.000      987
Name: market_id, dtype: int64

store_primary_category

In [23]:
data[data['store_primary_category'].isna()].head()
Out[23]:
market_id created_at actual_delivery_time store_id store_primary_category order_protocol total_items subtotal num_distinct_items min_item_price ... delivery_time_secs hour day hour_0_6 hour_6_11 hour_11_15 hour_15_18 hour_18_23 avg_price_item avg_price_distinct_item
2 3.000 2015-01-22 12:39:28-08:00 2015-01-22 13:09:09-08:00 5477 NaN 1.000 1 1900 1 1900 ... 1781.000 12 3 0 0 1 0 0 1900.000 1900.000
3 3.000 2015-02-03 13:21:45-08:00 2015-02-03 14:13:00-08:00 5477 NaN 1.000 6 6900 5 600 ... 3075.000 13 1 0 0 1 0 0 1150.000 1380.000
4 3.000 2015-02-14 18:40:36-08:00 2015-02-14 19:20:26-08:00 5477 NaN 1.000 3 3900 3 1100 ... 2390.000 18 5 0 0 0 0 1 1300.000 1300.000
5 3.000 2015-01-28 12:30:38-08:00 2015-01-28 13:08:58-08:00 5477 NaN 1.000 3 5000 3 1500 ... 2300.000 12 2 0 0 1 0 0 1666.667 1666.667
6 3.000 2015-01-30 18:16:36-08:00 2015-01-30 18:43:00-08:00 5477 NaN 1.000 2 3900 2 1200 ... 1584.000 18 4 0 0 0 0 1 1950.000 1950.000

5 rows × 26 columns

In [24]:
# Set to "other" wherever NA
data['store_primary_category'] = np.where(data['store_primary_category'].isna(), 'other', data['store_primary_category'])
In [25]:
data['store_primary_category'] = data['store_primary_category'].astype('category')
In [26]:
data['store_primary_category_cat'] = data['store_primary_category'].cat.codes
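
cat.codes numbers the categories in their (alphabetical) order, so the mapping should be persisted if the model is ever served on new data; a sketch:

# Persist the category -> code mapping for inference time (sketch)
category_map = {cat: code for code, cat in
                enumerate(data['store_primary_category'].cat.categories)}
# New data could then be encoded with .map(category_map).fillna(-1)
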

order_protocol

In [27]:
data[data['order_protocol'].isna()].head()
Out[27]:
market_id created_at actual_delivery_time store_id store_primary_category order_protocol total_items subtotal num_distinct_items min_item_price ... hour day hour_0_6 hour_6_11 hour_11_15 hour_15_18 hour_18_23 avg_price_item avg_price_distinct_item store_primary_category_cat
32 1.000 2015-01-28 12:33:04-08:00 2015-01-28 13:04:14-08:00 4149 other nan 3 1765 3 275 ... 12 2 0 0 1 0 0 588.333 588.333 51
179 4.000 2015-01-24 11:48:45-08:00 2015-01-24 12:31:14-08:00 5081 mediterranean nan 2 2070 2 325 ... 11 5 0 0 1 0 0 1035.000 1035.000 46
575 2.000 2015-02-10 11:36:38-08:00 2015-02-10 12:24:14-08:00 2716 other nan 9 4781 8 379 ... 11 1 0 0 1 0 0 531.222 597.625 51
970 99.000 2015-02-16 18:17:43-08:00 2015-02-16 19:15:14-08:00 1148 other nan 3 3400 3 1000 ... 18 0 0 0 0 0 1 1133.333 1133.333 51
1035 4.000 2015-02-08 18:10:29-08:00 2015-02-08 18:59:20-08:00 5171 other nan 2 2790 2 1295 ... 18 6 0 0 0 0 1 1395.000 1395.000 51

5 rows × 27 columns

In [28]:
# Map missing order_protocol values to an 'unknown' protocol (id=99)
data['order_protocol'] = np.where(data['order_protocol'].isna(), 99, data['order_protocol'])
In [29]:
data['order_protocol'].value_counts()
Out[29]:
1.000     54723
3.000     53197
5.000     44289
2.000     24051
4.000     19353
99.000      995
6.000       794
7.000        19
Name: order_protocol, dtype: int64

estimated_store_to_consumer_driving_duration - (Handled in Model 2)

total_onshift_dashers, total_busy_dashers, total_outstanding_orders

In [30]:
data.loc[data['total_onshift_dashers'].isna()].head()
Out[30]:
market_id created_at actual_delivery_time store_id store_primary_category order_protocol total_items subtotal num_distinct_items min_item_price ... hour day hour_0_6 hour_6_11 hour_11_15 hour_15_18 hour_18_23 avg_price_item avg_price_distinct_item store_primary_category_cat
160 6.000 2015-02-05 17:11:56-08:00 2015-02-05 17:42:51-08:00 976 breakfast 2.000 2 575 2 225 ... 17 3 0 0 0 1 0 287.500 287.500 10
161 6.000 2015-02-13 18:07:47-08:00 2015-02-13 19:17:37-08:00 976 breakfast 2.000 5 1415 3 185 ... 18 4 0 0 0 0 1 283.000 471.667 10
162 6.000 2015-01-31 13:58:30-08:00 2015-01-31 14:55:32-08:00 976 breakfast 2.000 1 650 1 650 ... 13 5 0 0 1 0 0 650.000 650.000 10
163 6.000 2015-02-07 19:28:59-08:00 2015-02-07 21:32:11-08:00 976 breakfast 2.000 5 1550 5 225 ... 19 5 0 0 0 0 1 310.000 310.000 10
164 6.000 2015-01-23 11:29:17-08:00 2015-01-23 12:25:25-08:00 976 breakfast 2.000 6 1110 5 185 ... 11 4 0 0 1 0 0 185.000 222.000 10

5 rows × 27 columns

In [31]:
"""
Create 3 maps:
key: ('market_id','store_id','day','hour')
value: average of total_onshift_dashers, total_busy_dashers or total_outstanding_orders 
"""
dashers_map = \
        data[['market_id','store_id','day','hour','total_onshift_dashers','total_busy_dashers','total_outstanding_orders']] \
        .groupby(['market_id','store_id','day','hour']).mean().to_dict()
print(dashers_map.keys())
total_onshift_dashers_map = dashers_map['total_onshift_dashers']
total_busy_dashers_map = dashers_map['total_busy_dashers']
total_outstanding_orders_map = dashers_map['total_outstanding_orders']
for k,v in total_onshift_dashers_map.items():
    print(k,v)
    break
for k,v in total_busy_dashers_map.items():
    print(k,v)
    break
for k,v in total_outstanding_orders_map.items():
    print(k,v)
    break
dict_keys(['total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders'])
(1.0, 2, 0, 18) 23.0
(1.0, 2, 0, 18) 24.0
(1.0, 2, 0, 18) 37.0
In [32]:
# Replace NAs with the average total_onshift_dashers for the (market, store, day, hour) group
def total_onshift_dashers_missing(row):
    ret = row['total_onshift_dashers']
    if np.isnan(ret):
        ret = total_onshift_dashers_map[(row['market_id'],row['store_id'],row['day'],row['hour'])]
    return ret
        
print(data[data['total_onshift_dashers'].isna()].shape)
"""
data['total_onshift_dashers'] = \
    data[['market_id','store_id','day','hour','total_onshift_dashers']] \
    .apply(total_onshift_dashers_missing, axis=1)
"""
(16262, 27)
Out[32]:
"\ndata['total_onshift_dashers'] =     data[['market_id','store_id','day','hour','total_onshift_dashers']]     .apply(total_onshift_dashers_missing, axis=1)\n"
In [33]:
df_missing_dashers = data.loc[data['total_onshift_dashers'].isna()]
In [34]:
df_missing_dashers['hour'].value_counts().sort_index().plot(kind='bar')
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1fdd03c8>
In [35]:
df_missing_dashers['delivery_time_secs'].quantile([ i*1.0/100 for i in range(0,100,5)])
Out[35]:
0.000    557.000
0.050   1480.000
0.100   1683.000
0.150   1834.000
0.200   1970.000
0.250   2092.000
0.300   2203.000
0.350   2315.000
0.400   2421.000
0.450   2527.000
0.500   2635.000
0.550   2755.000
0.600   2885.000
0.650   3028.000
0.700   3184.000
0.750   3365.000
0.800   3587.800
0.850   3873.850
0.900   4245.900
0.950   4919.950
Name: delivery_time_secs, dtype: float64
In [36]:
# Fill these cols with 0 (for now)
data[['total_onshift_dashers','total_busy_dashers','total_outstanding_orders']] = \
    data[['total_onshift_dashers','total_busy_dashers','total_outstanding_orders']].fillna(0)
In [37]:
# Missing values at this point (baseline data set)
# Only estimated_store_to_consumer_driving_duration still has missing values (handled at the modeling stage)
data.isna().sum()
Out[37]:
market_id                                         0
created_at                                        0
actual_delivery_time                              0
store_id                                          0
store_primary_category                            0
order_protocol                                    0
total_items                                       0
subtotal                                          0
num_distinct_items                                0
min_item_price                                    0
max_item_price                                    0
total_onshift_dashers                             0
total_busy_dashers                                0
total_outstanding_orders                          0
estimated_order_place_duration                    0
estimated_store_to_consumer_driving_duration    526
delivery_time_secs                                0
hour                                              0
day                                               0
hour_0_6                                          0
hour_6_11                                         0
hour_11_15                                        0
hour_15_18                                        0
hour_18_23                                        0
avg_price_item                                    0
avg_price_distinct_item                           0
store_primary_category_cat                        0
dtype: int64
In [38]:
data.dtypes
Out[38]:
market_id                                                          float64
created_at                                      datetime64[ns, US/Pacific]
actual_delivery_time                            datetime64[ns, US/Pacific]
store_id                                                             int64
store_primary_category                                            category
order_protocol                                                     float64
total_items                                                          int64
subtotal                                                             int64
num_distinct_items                                                   int64
min_item_price                                                       int64
max_item_price                                                       int64
total_onshift_dashers                                              float64
total_busy_dashers                                                 float64
total_outstanding_orders                                           float64
estimated_order_place_duration                                       int64
estimated_store_to_consumer_driving_duration                       float64
delivery_time_secs                                                 float64
hour                                                                 int64
day                                                                  int64
hour_0_6                                                             int64
hour_6_11                                                            int64
hour_11_15                                                           int64
hour_15_18                                                           int64
hour_18_23                                                           int64
avg_price_item                                                     float64
avg_price_distinct_item                                            float64
store_primary_category_cat                                            int8
dtype: object

Model - Baseline

Using Gradient Boosted Decision Trees

In [39]:
def train_test_split_df(dataframe):
    # Copy dataframe
    data_processed = dataframe.copy(deep=True)
    
    y = data_processed['delivery_time_secs']
    drop_cols = ['created_at','actual_delivery_time','delivery_time_secs','store_primary_category']
    X = data_processed.drop(drop_cols, axis=1)
    
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X, y, test_size=0.25, random_state=43)
    print(X_train_df.shape, y_train_df.shape)
    print(X_test_df.shape, y_test_df.shape)
    
    return X_train_df, X_test_df, y_train_df, y_test_df, data_processed.columns

def train_test_split_np(dataframe):
    # Copy dataframe
    data_processed = dataframe.copy(deep=True)
    
    y = data_processed['delivery_time_secs'].values
    drop_cols = ['created_at','actual_delivery_time','delivery_time_secs','store_primary_category']
    data_processed.drop(drop_cols, axis=1, inplace=True)
    X = data_processed.values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=43)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    
    return X_train, X_test, y_train, y_test, data_processed.columns

def train_X_y(dataframe):
    # Copy dataframe
    data_processed = dataframe.copy(deep=True)
    
    y = data_processed['delivery_time_secs'].values
    drop_cols = ['created_at','actual_delivery_time','delivery_time_secs','store_primary_category']
    data_processed.drop(drop_cols, axis=1, inplace=True)
    X = data_processed.values
    
    print(X.shape, y.shape)
    
    return X, y, data_processed.columns

def train_model(X_train, y_train):
    
    # Log Transform the target
    y_train = np.log(y_train)
    
    param_grid = {
        'n_estimators': [200],
        'min_samples_split': [500],
        'min_samples_leaf': [100],
        'max_depth': [7]
    }
    
    gbdt_reg = GradientBoostingRegressor(verbose=10)
    print(gbdt_reg)
    
    
    grid_search = GridSearchCV(gbdt_reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print('# Best Params', grid_search.best_params_)
    print('# Best Score', grid_search.best_score_)
    
    # Model score: the target is log-transformed, so exp(RMSE) is a
    # multiplicative error factor (e.g. 1.33 = within ~33%), not literally seconds
    print('# Mean Squared Error (Best Score)', np.exp(np.sqrt(-grid_search.best_score_)), 'secs')
    
    return grid_search.best_estimator_

def predict(estimator, X_test, y_test):
    # Model prediction on the test set (predictions are in log space)
    y_hat = estimator.predict(X_test)
    
    # Log-transform y_test to match
    y_test = np.log(y_test)
    
    mse = mean_squared_error(y_test, y_hat)
    mae = mean_absolute_error(y_test, y_hat)
    print('MSE', mse)
    print('MAE', mae)
    # exp of a log-space error is a multiplicative factor, not literally seconds
    print('# Prediction Error (RMSE)', np.exp(np.sqrt(mse)), 'secs')
    print('# Prediction Error (MAE)', np.exp(mae), 'secs')

def train_model_orig(dataframe):
    # Earlier monolithic version of the split/train/predict pipeline above; kept for reference
    # Train on a copied dataframe
    data_processed = dataframe.copy(deep=True)
    
    # Log Transform the target
    y = np.log(data_processed['delivery_time_secs'].values)
    print(y.shape)
    
    drop_cols = ['created_at','actual_delivery_time','delivery_time_secs','store_primary_category']
    data_processed.drop(drop_cols, axis=1, inplace=True)

    # Predictors
    X = data_processed.values
    X.shape
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=43)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    
    param_grid = {
        'n_estimators': [200],
        'min_samples_split': [500],
        'min_samples_leaf': [100],
        'max_depth': [7]
    }
    
    gbdt_reg = GradientBoostingRegressor(verbose=10)
    print(gbdt_reg)
    
    
    grid_search = GridSearchCV(gbdt_reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print('# Best Params', grid_search.best_params_)
    print('# Best Score', grid_search.best_score_)
    
    # Model Score
    print('# Mean Squared Error (Best Score)', np.exp(np.sqrt(-grid_search.best_score_)), 'secs')
    print('# Mean Squared Error (Best Score)', np.exp(np.sqrt(-grid_search.best_score_))/60, 'mins')
    print('# Mean Squared Error (Best Score)', np.exp(np.sqrt(-grid_search.best_score_))/3600, 'hrs')
    
    #Model Prediction on Test Set
    y_hat = grid_search.best_estimator_.predict(X_test)
    mse = mean_squared_error(y_test, y_hat)
    print('# Prediction Error', np.exp(np.sqrt(mse)), 'secs')
    print('# Prediction Error', np.exp(np.sqrt(mse))/60, 'mins')
    print('# Prediction Error', np.exp(np.sqrt(mse))/3600, 'hrs')
    
    return grid_search.best_estimator_, data_processed.columns
    
In [40]:
def plot_feature_importance(estimator, features):
    feature_importances__reg = estimator.feature_importances_
    indices = np.argsort(feature_importances__reg)[::-1]
    # Plot feature importance
    fig = plt.figure(figsize=(20, 10), dpi=80)
    # make importances relative to max importance
    feature_importances__reg = 100.0 * (feature_importances__reg / feature_importances__reg.max())
    sorted_idx = np.argsort(feature_importances__reg)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 1)
    plt.barh(pos, feature_importances__reg[sorted_idx], align='center')
    plt.yticks(pos, np.array(features)[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Feature Importance')
In [41]:
# Make a copy of baseline dataset
data_processed_1 = data.copy(deep=True)
# Handle missing value
data_processed_1['estimated_store_to_consumer_driving_duration'] = \
    data_processed_1['estimated_store_to_consumer_driving_duration'].fillna(0)
In [42]:
X_train, X_test, y_train, y_test, features = train_test_split_np(data_processed_1)
(148065, 23) (148065,)
(49356, 23) (49356,)
In [43]:
best_estimator_ = train_model(X_train, y_train)
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=10, warm_start=False)
      Iter       Train Loss   Remaining Time 
         1           0.1267            3.77m
         2           0.1220            3.70m
         3           0.1183            3.59m
         4           0.1150            3.53m
         5           0.1122            3.46m
         6           0.1098            3.43m
         7           0.1075            3.37m
         8           0.1055            3.32m
         9           0.1036            3.27m
        10           0.1020            3.22m
        11           0.1004            3.18m
        12           0.0991            3.14m
        13           0.0981            3.08m
        14           0.0971            3.05m
        15           0.0961            3.01m
        16           0.0951            2.97m
        17           0.0944            2.92m
        18           0.0938            2.88m
        19           0.0932            2.86m
        20           0.0924            2.83m
        21           0.0919            2.79m
        22           0.0915            2.75m
        23           0.0910            2.71m
        24           0.0906            2.68m
        25           0.0901            2.65m
        26           0.0897            2.62m
        27           0.0894            2.59m
        28           0.0889            2.57m
        29           0.0886            2.55m
        30           0.0883            2.51m
        31           0.0878            2.49m
        32           0.0875            2.47m
        33           0.0872            2.43m
        34           0.0868            2.40m
        35           0.0866            2.37m
        36           0.0863            2.35m
        37           0.0860            2.33m
        38           0.0857            2.30m
        39           0.0854            2.28m
        40           0.0851            2.26m
        41           0.0848            2.25m
        42           0.0847            2.22m
        43           0.0845            2.19m
        44           0.0842            2.18m
        45           0.0839            2.16m
        46           0.0838            2.13m
        47           0.0835            2.12m
        48           0.0833            2.09m
        49           0.0831            2.08m
        50           0.0830            2.05m
        51           0.0828            2.03m
        52           0.0826            2.01m
        53           0.0824            2.00m
        54           0.0823            1.97m
        55           0.0822            1.95m
        56           0.0821            1.94m
        57           0.0820            1.91m
        58           0.0818            1.89m
        59           0.0817            1.87m
        60           0.0815            1.85m
        61           0.0813            1.84m
        62           0.0811            1.82m
        63           0.0810            1.80m
        64           0.0809            1.78m
        65           0.0808            1.77m
        66           0.0806            1.75m
        67           0.0806            1.73m
        68           0.0804            1.71m
        69           0.0803            1.69m
        70           0.0802            1.67m
        71           0.0801            1.66m
        72           0.0800            1.65m
        73           0.0799            1.62m
        74           0.0798            1.61m
        75           0.0797            1.60m
        76           0.0796            1.58m
        77           0.0795            1.57m
        78           0.0794            1.55m
        79           0.0792            1.54m
        80           0.0792            1.52m
        81           0.0791            1.51m
        82           0.0790            1.50m
        83           0.0789            1.48m
        84           0.0788            1.47m
        85           0.0787            1.45m
        86           0.0787            1.44m
        87           0.0786            1.42m
        88           0.0785            1.41m
        89           0.0784            1.39m
        90           0.0783            1.38m
        91           0.0783            1.36m
        92           0.0782            1.35m
        93           0.0781            1.33m
        94           0.0781            1.32m
        95           0.0779            1.31m
        96           0.0778            1.30m
        97           0.0777            1.28m
        98           0.0777            1.26m
        99           0.0776            1.25m
       100           0.0775            1.24m
       101           0.0775            1.22m
       102           0.0774            1.21m
       103           0.0774            1.19m
       104           0.0773            1.18m
       105           0.0773            1.16m
       106           0.0772            1.15m
       107           0.0771            1.14m
       108           0.0770            1.12m
       109           0.0770            1.11m
       110           0.0770            1.09m
       111           0.0769            1.08m
       112           0.0769            1.06m
       113           0.0768            1.05m
       114           0.0768            1.04m
       115           0.0768            1.02m
       116           0.0767            1.01m
       117           0.0767           59.65s
       118           0.0766           58.89s
       119           0.0766           58.06s
       120           0.0765           57.26s
       121           0.0765           56.62s
       122           0.0764           55.84s
       123           0.0764           54.94s
       124           0.0763           54.21s
       125           0.0763           53.42s
       126           0.0763           52.69s
       127           0.0762           51.84s
       128           0.0762           51.11s
       129           0.0761           50.42s
       130           0.0760           49.71s
       131           0.0760           48.86s
       132           0.0759           48.13s
       133           0.0759           47.29s
       134           0.0758           46.62s
       135           0.0758           45.85s
       136           0.0757           45.07s
       137           0.0757           44.35s
       138           0.0756           43.59s
       139           0.0756           42.74s
       140           0.0756           41.91s
       141           0.0756           41.12s
       142           0.0755           40.37s
       143           0.0755           39.62s
       144           0.0755           38.88s
       145           0.0754           38.24s
       146           0.0754           37.51s
       147           0.0753           36.83s
       148           0.0753           36.05s
       149           0.0753           35.38s
       150           0.0753           34.60s
       151           0.0753           33.85s
       152           0.0752           33.19s
       153           0.0752           32.45s
       154           0.0752           31.74s
       155           0.0752           31.04s
       156           0.0752           30.33s
       157           0.0752           29.56s
       158           0.0751           28.79s
       159           0.0751           28.13s
       160           0.0750           27.44s
       161           0.0750           26.74s
       162           0.0750           26.00s
       163           0.0750           25.24s
       164           0.0750           24.50s
       165           0.0750           23.77s
       166           0.0749           23.07s
       167           0.0749           22.32s
       168           0.0749           21.64s
       169           0.0748           20.96s
       170           0.0747           20.29s
       171           0.0747           19.62s
       172           0.0746           18.92s
       173           0.0746           18.23s
       174           0.0746           17.53s
       175           0.0745           16.85s
       176           0.0745           16.15s
       177           0.0744           15.48s
       178           0.0744           14.79s
       179           0.0744           14.08s
       180           0.0744           13.38s
       181           0.0743           12.71s
       182           0.0743           12.03s
       183           0.0742           11.37s
       184           0.0742           10.70s
       185           0.0742           10.03s
       186           0.0742            9.34s
       187           0.0741            8.65s
       188           0.0741            7.98s
       189           0.0741            7.31s
       190           0.0741            6.63s
       191           0.0741            5.97s
       192           0.0740            5.30s
       193           0.0740            4.63s
       194           0.0739            3.97s
       195           0.0739            3.30s
       196           0.0739            2.64s
       197           0.0739            1.98s
       198           0.0738            1.32s
       199           0.0738            0.66s
       200           0.0738            0.00s
# Best Params {'max_depth': 7, 'min_samples_leaf': 100, 'min_samples_split': 500, 'n_estimators': 200}
# Best Score -0.08082314093831001
# Mean Squared Error (Best Score) 1.328823693523826 secs
In [44]:
plot_feature_importance(best_estimator_, features)
In [45]:
predict(best_estimator_, X_test, y_test)
MSE 0.08103641702806552
MAE 0.21870642240280252
# Prediction Error (RMSE) 1.3293218971160097 secs
# Prediction Error (MAE) 1.2444658757517937 secs
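
Because the model is fit on log(delivery_time_secs), the 'secs' labels above are really multiplicative factors: exp(RMSE) ≈ 1.33 means predictions are typically within a factor of about 1.33 (±33%) of the actual time, roughly ±900 s around the 2660 s median. To get an error in literal seconds, back-transform the predictions first (sketch):

# MAE on the original scale (sketch): back-transform predictions from log space
y_hat_secs = np.exp(best_estimator_.predict(X_test))
print('MAE:', mean_absolute_error(y_test, y_hat_secs), 'secs')
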
In [46]:
# Save Model
dump(best_estimator_, 'gbdt_delivery_time_v1.joblib')
Out[46]:
['gbdt_delivery_time_v1.joblib']
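
Reloading the persisted model elsewhere is symmetric; predictions come back in log space (sketch):

# Reload the saved model and score (sketch)
model = load('gbdt_delivery_time_v1.joblib')
preds_secs = np.exp(model.predict(X_test))  # back-transform to seconds
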

Feature Engineering - II

In [47]:
# Create train test split based on date
# Create aggregate features based on past data and transform train/test
data_processed_2 = data.copy(deep=True)
data_processed_2 = data_processed_2.sort_values(by = ['created_at'])
# Take the first 90% (chronologically) as the training set
cnt = data_processed_2.shape[0]
cnt_train = int(cnt * 0.90)
print(cnt, cnt_train)
data_train = data_processed_2[:cnt_train]
data_test = data_processed_2[cnt_train:]
print(data_train.shape, data_test.shape)
print(data_train['created_at'].head(5))
print(data_train['created_at'].tail(5))
print(data_test['created_at'].head(5))
print(data_test['created_at'].tail(5))
197421 177678
(177678, 27) (19743, 27)
2690     2014-10-18 22:24:15-07:00
43519    2015-01-21 07:22:03-08:00
148754   2015-01-21 07:31:51-08:00
187014   2015-01-21 07:39:16-08:00
10265    2015-01-21 07:40:42-08:00
Name: created_at, dtype: datetime64[ns, US/Pacific]
21618    2015-02-15 12:47:00-08:00
34168    2015-02-15 12:47:00-08:00
186642   2015-02-15 12:47:04-08:00
185979   2015-02-15 12:47:09-08:00
49991    2015-02-15 12:47:10-08:00
Name: created_at, dtype: datetime64[ns, US/Pacific]
65436    2015-02-15 12:47:13-08:00
175534   2015-02-15 12:47:25-08:00
187350   2015-02-15 12:47:31-08:00
78262    2015-02-15 12:47:37-08:00
139332   2015-02-15 12:47:42-08:00
Name: created_at, dtype: datetime64[ns, US/Pacific]
176616   2015-02-17 21:57:51-08:00
100474   2015-02-17 21:58:07-08:00
191692   2015-02-17 21:59:01-08:00
168114   2015-02-17 21:59:23-08:00
61787    2015-02-17 22:00:44-08:00
Name: created_at, dtype: datetime64[ns, US/Pacific]
In [48]:
# Create aggregate features from historical data
# Average delivery time secs for market/store
market_map = data_train[['market_id','delivery_time_secs']].groupby(['market_id']).mean().to_dict()
market_map = market_map['delivery_time_secs']
store_map = data_train[['store_id','delivery_time_secs']].groupby(['store_id']).mean().to_dict()
store_map = store_map['delivery_time_secs']

# Average delivery time secs per Day of Week for market/store
market_dow_map = data_train[['market_id','day','delivery_time_secs']].groupby(['market_id','day']).mean().to_dict()
market_dow_map = market_dow_map['delivery_time_secs']
store_dow_map = data_train[['store_id','day','delivery_time_secs']].groupby(['store_id','day']).mean().to_dict()
store_dow_map = store_dow_map['delivery_time_secs']

# Average delivery time sec per Hour of day for market/store
market_hour_map = data_train[['market_id','hour','delivery_time_secs']].groupby(['market_id','hour']).mean().to_dict()
market_hour_map = market_hour_map['delivery_time_secs']
store_hour_map = data_train[['store_id','hour','delivery_time_secs']].groupby(['store_id','hour']).mean().to_dict()
store_hour_map = store_hour_map['delivery_time_secs']

# Average delivery time secs per Day of Week, Hour of Day for market/store
market_dow_hour_map = data_train[['market_id','day','hour','delivery_time_secs']] \
.groupby(['market_id','day','hour']).mean().to_dict()
market_dow_hour_map = market_dow_hour_map['delivery_time_secs']
store_dow_hour_map = data_train[['store_id','day','hour','delivery_time_secs']] \
.groupby(['store_id','day','hour']).mean().to_dict()
store_dow_hour_map = store_dow_hour_map['delivery_time_secs']


"""
Average estimated_store_to_consumer_driving_duration for every store 
If data for a store isn't available, roll up to market level
***Better Approach: estimated_store_to_consumer_driving_duration per store (or market) per day of week per hr***
"""
market_estimated_store_to_consumer_driving_duration_map = \
        data_train[['market_id','estimated_store_to_consumer_driving_duration']] \
        .groupby('market_id').mean().to_dict()
market_estimated_store_to_consumer_driving_duration_map = \
        market_estimated_store_to_consumer_driving_duration_map['estimated_store_to_consumer_driving_duration']
print(len(market_estimated_store_to_consumer_driving_duration_map))


# Average estimated_store_to_consumer_driving_duration for every store (if store data isn't available, roll up to market level)
store_estimated_store_to_consumer_driving_duration_map = \
        data_train[['store_id','estimated_store_to_consumer_driving_duration']] \
        .groupby('store_id').mean().to_dict()
store_estimated_store_to_consumer_driving_duration_map = \
        store_estimated_store_to_consumer_driving_duration_map['estimated_store_to_consumer_driving_duration']
print(len(store_estimated_store_to_consumer_driving_duration_map))
7
6660
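
The eight groupby blocks above all follow the same pattern; they could be built in one loop keyed by column set, which also makes it easy to add or drop granularities. A sketch producing equivalently keyed dicts:

# Build all average-delivery-time maps in one pass (sketch)
key_sets = [['market_id'], ['store_id'],
            ['market_id', 'day'], ['store_id', 'day'],
            ['market_id', 'hour'], ['store_id', 'hour'],
            ['market_id', 'day', 'hour'], ['store_id', 'day', 'hour']]
agg_maps = {tuple(keys): data_train.groupby(keys)['delivery_time_secs'].mean().to_dict()
            for keys in key_sets}
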
In [49]:
# Add historical average-delivery-time features to each row
def add_aggregate_features(row):
    avg_delivery_time_market = market_map.get(row['market_id'], 0)
    avg_delivery_time_store = store_map.get(row['store_id'], 0) 
    
    avg_delivery_time_market_dow = market_dow_map.get((row['market_id'],row['day']), 0)
    avg_delivery_time_store_dow = store_dow_map.get((row['store_id'],row['day']), 0)
    
    avg_delivery_time_market_hour = market_hour_map.get((row['market_id'],row['hour']), 0)
    avg_delivery_time_store_hour = store_hour_map.get((row['store_id'],row['hour']), 0)
    
    avg_delivery_time_market_dow_hour = market_dow_hour_map.get((row['market_id'],row['day'],row['hour']), 0) 
    avg_delivery_time_store_dow_hour = store_dow_hour_map.get((row['store_id'],row['day'],row['hour']), 0)
    
    return pd.Series((avg_delivery_time_market, avg_delivery_time_store, \
           avg_delivery_time_market_dow, avg_delivery_time_store_dow, \
           avg_delivery_time_market_hour, avg_delivery_time_store_hour, \
           avg_delivery_time_market_dow_hour, avg_delivery_time_store_dow_hour))

cols = ['avg_delivery_time_market', 'avg_delivery_time_store', \
        'avg_delivery_time_market_dow', 'avg_delivery_time_store_dow',
        'avg_delivery_time_market_hour', 'avg_delivery_time_store_hour',
        'avg_delivery_time_market_dow_hour', 'avg_delivery_time_store_dow_hour'
       ]

# Transform train/test
data_train[cols] = data_train[['market_id','store_id','day','hour','delivery_time_secs']] \
    .apply(add_aggregate_features, axis=1)
data_test[cols] = data_test[['market_id','store_id','day','hour','delivery_time_secs']] \
    .apply(add_aggregate_features, axis=1)
    
/Users/ssatpati/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py:3140: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
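
The SettingWithCopyWarning arises because data_train and data_test are slices of data_processed_2. Taking explicit copies at split time (sketch) makes the later column assignments unambiguous and silences the warning:

# Explicit copies avoid SettingWithCopyWarning on later assignments (sketch)
data_train = data_processed_2[:cnt_train].copy()
data_test = data_processed_2[cnt_train:].copy()
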
In [50]:
# Replace the NAs with average store to consumer driving duration
def estimated_store_to_consumer_driving_duration_missing(row):
    ret = row['estimated_store_to_consumer_driving_duration']
    if np.isnan(ret):
        ret = store_estimated_store_to_consumer_driving_duration_map[row['store_id']]
        if np.isnan(ret):
            ret = market_estimated_store_to_consumer_driving_duration_map[row['market_id']]
    return ret


data_train['estimated_store_to_consumer_driving_duration'] = \
    data_train[['market_id','store_id','estimated_store_to_consumer_driving_duration']] \
    .apply(estimated_store_to_consumer_driving_duration_missing, axis=1)
data_test['estimated_store_to_consumer_driving_duration'] = \
    data_test[['market_id','store_id','estimated_store_to_consumer_driving_duration']] \
    .apply(estimated_store_to_consumer_driving_duration_missing, axis=1)
/Users/ssatpati/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
/Users/ssatpati/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
In [51]:
print(data_train.shape, data_test.shape)
(177678, 35) (19743, 35)
In [52]:
for col in cols:
    print(col, data_train[data_train[col] == 0].shape, data_test[data_test[col] == 0].shape)
avg_delivery_time_market (0, 35) (0, 35)
avg_delivery_time_store (0, 35) (126, 35)
avg_delivery_time_market_dow (0, 35) (0, 35)
avg_delivery_time_store_dow (0, 35) (1662, 35)
avg_delivery_time_market_hour (0, 35) (0, 35)
avg_delivery_time_store_hour (0, 35) (1826, 35)
avg_delivery_time_market_dow_hour (0, 35) (4, 35)
avg_delivery_time_store_dow_hour (0, 35) (8295, 35)
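
For test rows with no matching history, the aggregates above default to 0, which the model must learn to treat as a sentinel; over 40% of the test set lacks a (store, day, hour) match. A gentler option (sketch) is to fall back to the next coarser aggregate:

# Fallback hierarchy for unseen keys (sketch): store/dow/hour -> store/hour ->
# store -> market average
def store_dow_hour_with_fallback(row):
    for mapping, key in ((store_dow_hour_map, (row['store_id'], row['day'], row['hour'])),
                         (store_hour_map, (row['store_id'], row['hour'])),
                         (store_map, row['store_id']),
                         (market_map, row['market_id'])):
        val = mapping.get(key)
        if val is not None:
            return val
    return 0
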
In [53]:
X_train, y_train, features = train_X_y(data_train)
X_test, y_test, features = train_X_y(data_test)
(177678, 31) (177678,)
(19743, 31) (19743,)
In [54]:
best_estimator_ = train_model(X_train, y_train)
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=10, warm_start=False)
      Iter       Train Loss   Remaining Time 
         1           0.1142            5.72m
         2           0.0994            5.53m
         3           0.0874            5.48m
         4           0.0777            5.50m
         5           0.0697            5.43m
         6           0.0633            5.36m
         7           0.0581            5.28m
         8           0.0538            5.21m
         9           0.0504            5.14m
        10           0.0475            5.09m
        11           0.0452            5.03m
        12           0.0433            4.96m
        13           0.0418            4.91m
        14           0.0405            4.88m
        15           0.0395            4.83m
        16           0.0386            4.81m
        17           0.0379            4.77m
        18           0.0373            4.71m
        19           0.0368            4.66m
        20           0.0364            4.60m
        21           0.0360            4.56m
        22           0.0357            4.52m
        23           0.0354            4.46m
        24           0.0352            4.41m
        25           0.0350            4.36m
        26           0.0348            4.31m
        27           0.0346            4.26m
        28           0.0345            4.21m
        29           0.0343            4.16m
        30           0.0342            4.12m
        31           0.0340            4.07m
        32           0.0339            4.03m
        33           0.0338            3.99m
        34           0.0337            3.95m
        35           0.0336            3.92m
        36           0.0336            3.87m
        37           0.0335            3.83m
        38           0.0334            3.79m
        39           0.0333            3.74m
        40           0.0333            3.71m
        41           0.0332            3.66m
        42           0.0331            3.63m
        43           0.0330            3.60m
        44           0.0330            3.57m
        45           0.0329            3.53m
        46           0.0328            3.50m
        47           0.0327            3.47m
        48           0.0327            3.45m
        49           0.0326            3.43m
        50           0.0326            3.42m
        51           0.0325            3.41m
        52           0.0325            3.38m
        53           0.0325            3.37m
        54           0.0324            3.35m
        55           0.0324            3.32m
        56           0.0324            3.28m
        57           0.0323            3.27m
        58           0.0323            3.23m
        59           0.0323            3.22m
        60           0.0322            3.20m
        61           0.0322            3.14m
        62           0.0322            3.11m
        63           0.0322            3.08m
        64           0.0322            3.03m
        65           0.0321            2.99m
        66           0.0321            2.96m
        67           0.0321            2.92m
        68           0.0320            2.88m
        69           0.0320            2.83m
        70           0.0320            2.79m
        71           0.0320            2.76m
        72           0.0320            2.72m
        73           0.0320            2.68m
        74           0.0319            2.65m
        75           0.0319            2.62m
        76           0.0319            2.58m
        77           0.0319            2.55m
        78           0.0318            2.52m
        79           0.0318            2.49m
        80           0.0318            2.46m
        81           0.0318            2.42m
        82           0.0317            2.40m
        83           0.0317            2.38m
        84           0.0317            2.35m
        85           0.0317            2.32m
        86           0.0317            2.28m
        87           0.0317            2.25m
        88           0.0317            2.22m
        89           0.0317            2.19m
        90           0.0316            2.17m
        91           0.0316            2.14m
        92           0.0316            2.11m
        93           0.0316            2.08m
        94           0.0316            2.05m
        95           0.0316            2.02m
        96           0.0316            1.99m
        97           0.0316            1.96m
        98           0.0316            1.94m
        99           0.0316            1.91m
       100           0.0316            1.89m
       101           0.0315            1.86m
       102           0.0315            1.83m
       103           0.0315            1.81m
       104           0.0315            1.78m
       105           0.0315            1.76m
       106           0.0315            1.73m
       107           0.0315            1.70m
       108           0.0315            1.68m
       109           0.0315            1.65m
       110           0.0315            1.63m
       111           0.0315            1.60m
       112           0.0315            1.58m
       113           0.0314            1.55m
       114           0.0314            1.53m
       115           0.0314            1.50m
       116           0.0314            1.48m
       117           0.0314            1.46m
       118           0.0314            1.43m
       119           0.0314            1.41m
       120           0.0314            1.39m
       121           0.0314            1.36m
       122           0.0314            1.34m
       123           0.0314            1.32m
       124           0.0314            1.30m
       125           0.0314            1.27m
       126           0.0314            1.25m
       127           0.0314            1.23m
       128           0.0314            1.21m
       129           0.0313            1.19m
       130           0.0313            1.17m
       131           0.0313            1.14m
       132           0.0313            1.12m
       133           0.0313            1.10m
       134           0.0313            1.08m
       135           0.0313            1.06m
       136           0.0313            1.04m
       137           0.0313            1.02m
       138           0.0313            1.00m
       139           0.0313           58.91s
       140           0.0313           57.79s
       141           0.0313           56.66s
       142           0.0313           55.55s
       143           0.0313           54.41s
       144           0.0313           53.26s
       145           0.0312           52.16s
       146           0.0312           51.03s
       147           0.0312           49.95s
       148           0.0312           48.83s
       149           0.0312           47.71s
       150           0.0312           46.62s
       151           0.0312           45.52s
       152           0.0312           44.45s
       153           0.0312           43.44s
       154           0.0312           42.38s
       155           0.0312           41.36s
       156           0.0312           40.33s
       157           0.0312           39.31s
       158           0.0312           38.31s
       159           0.0312           37.32s
       160           0.0312           36.29s
       161           0.0312           35.27s
       162           0.0312           34.28s
       163           0.0312           33.27s
       164           0.0312           32.26s
       165           0.0312           31.27s
       166           0.0312           30.28s
       167           0.0312           29.29s
       168           0.0311           28.31s
       169           0.0311           27.36s
       170           0.0311           26.41s
       171           0.0311           25.49s
       172           0.0311           24.56s
       173           0.0311           23.61s
       174           0.0311           22.69s
       175           0.0311           21.77s
       176           0.0311           20.85s
       177           0.0311           19.94s
       178           0.0311           19.02s
       179           0.0311           18.12s
       180           0.0311           17.22s
       181           0.0311           16.31s
       182           0.0311           15.43s
       183           0.0311           14.55s
       184           0.0311           13.65s
       185           0.0310           12.77s
       186           0.0310           11.91s
       187           0.0310           11.04s
       188           0.0310           10.17s
       189           0.0310            9.31s
       190           0.0310            8.45s
       191           0.0310            7.59s
       192           0.0310            6.73s
       193           0.0310            5.88s
       194           0.0310            5.03s
       195           0.0310            4.18s
       196           0.0310            3.34s
       197           0.0310            2.50s
       198           0.0310            1.66s
       199           0.0310            0.83s
       200           0.0309            0.00s
# Best Params {'max_depth': 7, 'min_samples_leaf': 100, 'min_samples_split': 500, 'n_estimators': 200}
# Best Score -0.036390645264546134
# Mean Squared Error (Best Score) 1.210172997633434 secs
In [55]:
plot_feature_importance(best_estimator_, features)
In [56]:
predict(best_estimator_, X_test, y_test)
MSE 0.7119406096739447
MAE 0.6531295632036306
# Prediction Error (RMSE) 2.325106237426039 secs
# Prediction Error (MAE) 1.9215450254630602 secs
In [57]:
# Save Model
dump(best_estimator_, 'gbdt_delivery_time_v2.joblib')
Out[57]:
['gbdt_delivery_time_v2.joblib']
In [58]:
y_hat = best_estimator_.predict(X_test)
In [59]:
y_hat = pd.Series(np.exp(y_hat))
In [60]:
y_hat.describe()
Out[60]:
count   19743.000
mean     1976.751
std      1205.930
min       717.143
25%       829.558
50%      1935.055
75%      2804.064
max     16757.044
dtype: float64
In [66]:
fig = plt.figure(figsize=(10, 10), dpi=80)
pred_df = pd.DataFrame({'y_test':np.log(y_test), 'y_hat':np.log(y_hat)})
sns.scatterplot(x="y_test", y="y_hat", data=pred_df)
Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a26ca6908>
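
The scatter shows predictions compressing toward the middle of the range. Measured on a common scale, the chronological test split is clearly harder for this model than the random split was for the baseline (the log-space MAE of 0.653 is a ~1.92x typical factor versus ~1.24x before), which may reflect genuine distribution shift or the fine-grained aggregates overfitting the training window. Comparing in plain seconds (sketch):

# Error in literal seconds on the chronological test split (sketch)
print('MAE:', mean_absolute_error(y_test, y_hat), 'secs')
print('median actual:', np.median(y_test), 'secs')
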